# This script prepares the different imputed, oversampled and undersampled training datasets to develop the preschool CAPP models. 
# These preschool training datasets prepared in this script will have had the following optimisation techniques applied: MICE imputation > ADASYN oversampling > random undersampling to give 1:1 class balance
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "MICE_imputed_standardised_preschool_training_dataset_1185ID.csv" is found in IOWBC_imputed_data.xlsx, sheet: "Standardised MICE preschool training"
# The data in files named "MICE_imputed_oversampled_preschool_dataset_XXX.csv" were developed using the script "Data_preparation_CAPP_imputation_oversampling.txt" (data can be found in XXX).
# Python version 3.6.8 was used 

# Imports (must come before any use of the modules below; the original
# script called os.chdir before importing os, which raises NameError)
import os

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.utils import shuffle

# Set working directory to the folder holding the CSV inputs.
# NOTE(review): "/../../" resolves to the filesystem root — presumably an
# anonymised placeholder; replace with the real data directory before running.
os.chdir("/../../")


# Import the MICE-imputed, standardised preschool training dataset (no oversampling)
data_0 = pd.read_csv("MICE_imputed_standardised_preschool_training_dataset_1185ID.csv", index_col=False)
del data_0['Unnamed: 0']  # drop the exported row-index column
# Undersample the controls to match the 176 cases (1:1 class balance)
s1 = data_0.loc[data_0['Asthma_10YR'] == 1]
s0 = data_0.loc[data_0['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:176]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_0b = pd.concat([s1, s0], ignore_index=True)
data_0b = shuffle(data_0b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_0b.Asthma_10YR))
# Balanced dataset shape Counter({0: 176, 1: 176})


# Import the 25%-ADASYN-oversampled preschool training dataset
data_25 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_25%.csv", index_col=False)
data_25 = data_25.iloc[0:1229, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 220 cases (1:1 class balance)
s1 = data_25.loc[data_25['Asthma_10YR'] == 1]
s0 = data_25.loc[data_25['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:220]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_25b = pd.concat([s1, s0], ignore_index=True)
data_25b = shuffle(data_25b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_25b.Asthma_10YR))
# Balanced dataset shape Counter({1: 220, 0: 220})


# Import the 50%-ADASYN-oversampled preschool training dataset
data_50 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_50%.csv", index_col=False)
data_50 = data_50.iloc[0:1273, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 264 cases (1:1 class balance)
s1 = data_50.loc[data_50['Asthma_10YR'] == 1]
s0 = data_50.loc[data_50['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:264]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_50b = pd.concat([s1, s0], ignore_index=True)
data_50b = shuffle(data_50b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_50b.Asthma_10YR))
# Balanced dataset shape Counter({1: 264, 0: 264})


# Import the 100%-ADASYN-oversampled preschool training dataset
data_100 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_100%.csv", index_col=False)
data_100 = data_100.iloc[0:1361, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 352 cases (1:1 class balance)
s1 = data_100.loc[data_100['Asthma_10YR'] == 1]
s0 = data_100.loc[data_100['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:352]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_100b = pd.concat([s1, s0], ignore_index=True)
data_100b = shuffle(data_100b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_100b.Asthma_10YR))
# Balanced dataset shape Counter({1: 352, 0: 352})

# Import the 150%-ADASYN-oversampled preschool training dataset
data_150 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_150%.csv", index_col=False)
data_150 = data_150.iloc[0:1449, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 440 cases (1:1 class balance)
s1 = data_150.loc[data_150['Asthma_10YR'] == 1]
s0 = data_150.loc[data_150['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:440]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_150b = pd.concat([s1, s0], ignore_index=True)
data_150b = shuffle(data_150b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_150b.Asthma_10YR))
# Balanced dataset shape Counter({1: 440, 0: 440})

# Import the 200%-ADASYN-oversampled preschool training dataset
data_200 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_200%.csv", index_col=False)
data_200 = data_200.iloc[0:1537, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 528 cases (1:1 class balance)
s1 = data_200.loc[data_200['Asthma_10YR'] == 1]
s0 = data_200.loc[data_200['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:528]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_200b = pd.concat([s1, s0], ignore_index=True)
data_200b = shuffle(data_200b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_200b.Asthma_10YR))
# Balanced dataset shape Counter({1: 528, 0: 528})

# Import the 250%-ADASYN-oversampled preschool training dataset
data_250 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_250%.csv", index_col=False)
data_250 = data_250.iloc[0:1625, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 616 cases (1:1 class balance)
s1 = data_250.loc[data_250['Asthma_10YR'] == 1]
s0 = data_250.loc[data_250['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:616]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_250b = pd.concat([s1, s0], ignore_index=True)
data_250b = shuffle(data_250b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_250b.Asthma_10YR))
# Balanced dataset shape Counter({1: 616, 0: 616})

# Import the 300%-ADASYN-oversampled preschool training dataset
data_300 = pd.read_csv("MICE_imputed_oversampled_preschool_dataset_300%.csv", index_col=False)
data_300 = data_300.iloc[0:1713, :]  # keep only the rows belonging to this dataset
# Undersample the controls to match the 704 cases (1:1 class balance)
s1 = data_300.loc[data_300['Asthma_10YR'] == 1]
s0 = data_300.loc[data_300['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)  # fixed seed for reproducible undersampling
s0 = s0.iloc[:704]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
data_300b = pd.concat([s1, s0], ignore_index=True)
data_300b = shuffle(data_300b, random_state=123)
print('Balanced dataset shape %s' % Counter(data_300b.Asthma_10YR))
# Balanced dataset shape Counter({1: 704, 0: 704})


# Collect every balanced training dataset considered for model development
data = [
    data_0b,
    data_25b,
    data_50b,
    data_100b,
    data_150b,
    data_200b,
    data_250b,
    data_300b,
]

# Index list matching the number of datasets in `data`; the model-development
# scripts loop over these indices to visit each training set.
# NOTE(review): the name `set` shadows the built-in `set` type for the rest of
# this script — kept unchanged for compatibility with downstream scripts.
set = list(range(len(data)))

# Preschool test set, standardised against the imputed preschool training data
# (IOWBC_imputed_data.xlsx, sheet: "Standardised preschool test set")
test = pd.read_csv("Preschool_MICE_standardised_test_dataset_183IDs.csv", index_col=False)
del test['Unnamed: 0']  # drop the exported row-index column
# Separate the predictors from the 10-year asthma outcome
X_test = test.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_test = test['Asthma_10YR']